#function to resize plots later
fig <- function(width, heigth){options(repr.plot.width = width, repr.plot.height = heigth)}
conn <- dbConnect(RSQLite::SQLite(), "../spotify_analysis/charts.sqlite")
df <- tbl(conn, "top200") %>% collect()
dbDisconnect(conn)
df <- df %>%
separate_rows(artist, sep = ", ") %>%
mutate(date = ymd(date), trend = factor(trend))
summary(df)
## title rank date artist
## Length:447644 Min. : 1.0 Min. :2017-01-01 Length:447644
## Class :character 1st Qu.: 51.0 1st Qu.:2018-02-20 Class :character
## Mode :character Median :100.0 Median :2019-03-26 Mode :character
## Mean :100.2 Mean :2019-03-23
## 3rd Qu.:150.0 3rd Qu.:2020-04-19
## Max. :200.0 Max. :2021-06-15
## url trend streams
## Length:447644 MOVE_DOWN :192850 Min. : 325951
## Class :character MOVE_UP :176767 1st Qu.: 704685
## Mode :character NEW_ENTRY : 19818 Median : 900287
## SAME_POSITION: 58209 Mean : 1183243
## 3rd Qu.: 1357400
## Max. :17223237
#check for null values
df %>% select(artist) %>% is.na() %>% any()
## [1] FALSE
df %>%
select(artist) %>%
arrange(artist) %>%
unique() %>%
head()
## # A tibble: 6 x 1
## artist
## <chr>
## 1 ""
## 2 "-"
## 3 "$NOT"
## 4 "(G)I-DLE"
## 5 "*NSYNC"
## 6 "070 Shake"
df %>% filter(artist %in% c('-', ''))
## # A tibble: 20 x 7
## title rank date artist url trend streams
## <chr> <int> <date> <chr> <chr> <fct> <int>
## 1 "MLK Interlude" 148 2021-03-20 "-" https://open.spotify.c~ MOVE~ 886362
## 2 "MLK Interlude" 70 2021-03-19 "-" https://open.spotify.c~ NEW_~ 1426117
## 3 "" 156 2017-12-01 "" https://open.spotify.c~ NEW_~ 650496
## 4 "" 181 2017-12-01 "" https://open.spotify.c~ NEW_~ 588738
## 5 "" 30 2017-11-14 "" https://open.spotify.c~ MOVE~ 1581132
## 6 "" 31 2017-11-13 "" https://open.spotify.c~ MOVE~ 1473457
## 7 "" 27 2017-11-12 "" https://open.spotify.c~ MOVE~ 1544175
## 8 "" 26 2017-11-11 "" https://open.spotify.c~ MOVE~ 1762307
## 9 "" 30 2017-11-10 "" https://open.spotify.c~ MOVE~ 1660185
## 10 "" 32 2017-11-09 "" https://open.spotify.c~ MOVE~ 1496980
## 11 "" 89 2017-07-23 "" https://open.spotify.c~ MOVE~ 690247
## 12 "" 101 2017-07-22 "" https://open.spotify.c~ NEW_~ 747893
## 13 "" 5 2017-07-21 "" https://open.spotify.c~ SAME~ 3653533
## 14 "" 16 2017-07-21 "" https://open.spotify.c~ MOVE~ 2522453
## 15 "" 35 2017-07-21 "" https://open.spotify.c~ MOVE~ 1798890
## 16 "" 48 2017-07-21 "" https://open.spotify.c~ MOVE~ 1526955
## 17 "" 5 2017-07-20 "" https://open.spotify.c~ NEW_~ 3568811
## 18 "" 13 2017-07-20 "" https://open.spotify.c~ NEW_~ 2571960
## 19 "" 30 2017-07-20 "" https://open.spotify.c~ NEW_~ 1798208
## 20 "" 39 2017-07-20 "" https://open.spotify.c~ NEW_~ 1520291
We get a bunch of entries which we have to delete from the Dataset
df <- df %>% filter(!(artist %in% c('-', '')))
Checking if it worked:
df %>%
select(artist) %>%
arrange(artist) %>%
unique() %>%
head()
## # A tibble: 6 x 1
## artist
## <chr>
## 1 $NOT
## 2 (G)I-DLE
## 3 *NSYNC
## 4 070 Shake
## 5 13 Organisé
## 6 187 Strassenbande
Looks good :-) No more empty entries
Which artists are in the Top 200?
df %>%
select(artist) %>%
distinct() %>%
nrow()
## [1] 1594
So there are 1594 artists in the Top 200 Dataset.
Let’s explore which of these artists appeared the most often in the Top 200
df %>%
select(artist) %>%
group_by(artist) %>%
count() %>%
arrange(desc(n))
## # A tibble: 1,594 x 2
## # Groups: artist [1,594]
## artist n
## <chr> <int>
## 1 Ed Sheeran 9576
## 2 Post Malone 9542
## 3 J Balvin 9421
## 4 Bad Bunny 8471
## 5 Drake 7293
## 6 Ozuna 7040
## 7 XXXTENTACION 6785
## 8 Billie Eilish 6582
## 9 Travis Scott 5586
## 10 Ariana Grande 5282
## # ... with 1,584 more rows
Ed Sheeran, Post Malone and J Balvin sit comfortably on top of the list. But it’s a close race between them. But who got the most streams?
num_streams <- df %>%
select(artist, streams) %>%
group_by(artist) %>%
summarise(num_streams= sum(streams)) %>%
arrange(desc(num_streams))
num_streams %>% head(10)
## # A tibble: 10 x 2
## artist num_streams
## <chr> <dbl>
## 1 Post Malone 13976842359
## 2 Bad Bunny 11835167604
## 3 J Balvin 11695364249
## 4 Ed Sheeran 11475793196
## 5 Drake 9934735017
## 6 Billie Eilish 8674898986
## 7 Ariana Grande 8271296865
## 8 Ozuna 7713621696
## 9 XXXTENTACION 7063878971
## 10 Dua Lipa 6833617322
Seems like Bad Bunny has some very popular songs. Now we look into the number of songs each artist has in the Dataset. Do the artists with the most streams also have to most tracks?
num_tracks <- df %>%
select(artist, title) %>%
unique() %>%
group_by(artist) %>%
count(name = "num_tracks") %>%
arrange(desc(num_tracks))
num_tracks %>% head(10)
## # A tibble: 10 x 2
## # Groups: artist [10]
## artist num_tracks
## <chr> <int>
## 1 Drake 101
## 2 Future 99
## 3 Taylor Swift 96
## 4 Bad Bunny 90
## 5 BTS 78
## 6 Juice WRLD 73
## 7 Lil Uzi Vert 71
## 8 Eminem 66
## 9 J Balvin 66
## 10 Logic 62
df_join <- num_streams %>% inner_join(num_tracks)
## Joining, by = "artist"
head(df_join)
## # A tibble: 6 x 3
## artist num_streams num_tracks
## <chr> <dbl> <int>
## 1 Post Malone 13976842359 46
## 2 Bad Bunny 11835167604 90
## 3 J Balvin 11695364249 66
## 4 Ed Sheeran 11475793196 50
## 5 Drake 9934735017 101
## 6 Billie Eilish 8674898986 29
ggplot(df_join, aes(df_join$num_streams, df_join$num_tracks)) + geom_point() + geom_smooth(method = lm) + labs(x = "Stream Count", y = "Track Count", title = "Correlation Track and Stream Count")
## Warning: Use of `df_join$num_streams` is discouraged. Use `num_streams` instead.
## Warning: Use of `df_join$num_tracks` is discouraged. Use `num_tracks` instead.
## Warning: Use of `df_join$num_streams` is discouraged. Use `num_streams` instead.
## Warning: Use of `df_join$num_tracks` is discouraged. Use `num_tracks` instead.
## `geom_smooth()` using formula 'y ~ x'
cor.test(df_join$num_streams, df_join$num_tracks)
##
## Pearson's product-moment correlation
##
## data: df_join$num_streams and df_join$num_tracks
## t = 43.189, df = 1592, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7110645 0.7563409
## sample estimates:
## cor
## 0.7345191
Technically there is a high correlation. But if we look at the graph, we can see that that’s probably the case because of the high density around 0. There is no linear trend visible. We can observe a high correlation between the number of songs and the number of streams.
df %>%
select(artist, date) %>%
group_by(date) %>%
count(artist) %>%
arrange(desc(n)) %>%
head(10)
## # A tibble: 10 x 3
## # Groups: date [10]
## date artist n
## <date> <chr> <int>
## 1 2018-06-19 XXXTENTACION 37
## 2 2017-07-21 Linkin Park 34
## 3 2017-03-19 Drake 30
## 4 2017-03-20 Drake 29
## 5 2017-03-21 Drake 29
## 6 2017-03-22 Drake 29
## 7 2017-03-23 Drake 29
## 8 2018-06-29 Drake 29
## 9 2020-03-13 Lil Uzi Vert 29
## 10 2018-06-20 XXXTENTACION 28
XXXTENTACION had 37 songs at once in the Top 200 on the 19th June 2018. He got killed on the 18th June 2018 which probably explains this. Linkin Park had 34 Songs at once in the Top 200 on the 21th July 2017. One day before that their vocalist Chester Bennington commited suicide. Music unites people in bad (but also in good!) times…
df %>%
select(-c(url, trend)) %>%
arrange(desc(streams)) %>%
head(20)
## # A tibble: 20 x 5
## title rank date artist streams
## <chr> <int> <date> <chr> <int>
## 1 All I Want for Christmas Is You 1 2020-12-24 Mariah ~ 1.72e7
## 2 Last Christmas 2 2020-12-24 Wham! 1.58e7
## 3 drivers license 1 2021-01-15 Olivia ~ 1.37e7
## 4 drivers license 1 2021-01-14 Olivia ~ 1.37e7
## 5 drivers license 1 2021-01-13 Olivia ~ 1.36e7
## 6 All I Want for Christmas Is You 1 2020-12-25 Mariah ~ 1.36e7
## 7 drivers license 1 2021-01-12 Olivia ~ 1.34e7
## 8 good 4 u 1 2021-05-25 Olivia ~ 1.26e7
## 9 good 4 u 1 2021-05-26 Olivia ~ 1.24e7
## 10 Last Christmas 2 2020-12-25 Wham! 1.24e7
## 11 good 4 u 1 2021-05-27 Olivia ~ 1.23e7
## 12 Santa Tell Me 3 2020-12-24 Ariana ~ 1.22e7
## 13 good 4 u 1 2021-05-24 Olivia ~ 1.22e7
## 14 good 4 u 1 2021-05-21 Olivia ~ 1.22e7
## 15 drivers license 1 2021-01-16 Olivia ~ 1.21e7
## 16 All I Want for Christmas Is You 1 2019-12-24 Mariah ~ 1.20e7
## 17 It's Beginning to Look a Lot like Christmas 4 2020-12-24 Michael~ 1.20e7
## 18 Jingle Bell Rock 5 2020-12-24 Bobby H~ 1.19e7
## 19 Rockin' Around The Christmas Tree 6 2020-12-24 Brenda ~ 1.18e7
## 20 good 4 u 1 2021-05-28 Olivia ~ 1.18e7
It makes sense that most of those songs are popular christmas hits! But also Olivia Rodrigo seems to be very popular Let’s have a closer look at her:
df %>%
arrange(desc(streams)) %>%
head(50) %>%
filter(artist == "Olivia Rodrigo") %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 28
df %>%
arrange(desc(streams))%>%
head(50) %>%
group_by(artist) %>%
count() %>%
arrange(desc(n))
## # A tibble: 14 x 2
## # Groups: artist [14]
## artist n
## <chr> <int>
## 1 Olivia Rodrigo 28
## 2 Mariah Carey 4
## 3 Wham! 3
## 4 Ariana Grande 2
## 5 Bobby Helms 2
## 6 Brenda Lee 2
## 7 Michael Bublé 2
## 8 Andy Williams 1
## 9 Bad Bunny 1
## 10 BTS 1
## 11 Ed Sheeran 1
## 12 Jhay Cortez 1
## 13 Luis Fonsi 1
## 14 XXXTENTACION 1
Crazy, more than 50% of the 50 most streamed songs are from Olivia Rodrigo!
df %>%
filter(artist == "Olivia Rodrigo") %>%
arrange(date) %>%
head(1)
## # A tibble: 1 x 7
## title rank date artist url trend streams
## <chr> <int> <date> <chr> <chr> <fct> <int>
## 1 "All I Want - From \"High School ~ 192 2020-01-06 Olivi~ http~ NEW_~ 642819
The first time she appeared on the charts was 2020-01-06. Apparently she got famous through high school musical. Let’s see how she did from there.
df %>%
filter(artist == "Olivia Rodrigo") %>%
ggplot(aes(y = rank, x = date, color = title)) + geom_line(show.legend = FALSE) + scale_y_reverse() + theme_minimal()
So we see that she uninterrupted stayed in the charts since her first song. The longest with her first song. The songs after that stayed shorter in top ranks.
df %>%
filter(artist == "Olivia Rodrigo") %>%
summarise(streams_total = sum(streams))
## # A tibble: 1 x 1
## streams_total
## <int>
## 1 2087769350
df %>%
filter(artist == "Olivia Rodrigo") %>%
select(title) %>%
unique() %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 12
df %>%
filter(artist == "Olivia Rodrigo") %>%
group_by(title) %>%
summarise(total_streams = sum(streams)) %>%
arrange(desc(total_streams))
## # A tibble: 12 x 2
## title total_streams
## <chr> <int>
## 1 "drivers license" 813710554
## 2 "good 4 u" 317055897
## 3 "deja vu" 297460035
## 4 "traitor" 106119949
## 5 "happier" 86403906
## 6 "brutal" 82455487
## 7 "favorite crime" 82363485
## 8 "enough for you" 71417140
## 9 "jealousy, jealousy" 64540563
## 10 "1 step forward, 3 steps back" 62314847
## 11 "All I Want - From \"High School Musical: The Musical: The Ser~ 53199370
## 12 "hope ur ok" 50728117
df %>%
filter(artist == "Olivia Rodrigo") %>%
group_by(title) %>%
summarise(total_streams_percent = sum(streams)) %>%
mutate(total_streams_percent = total_streams_percent / sum(total_streams_percent) * 100) %>%
arrange(desc(total_streams_percent))
## # A tibble: 12 x 2
## title total_streams_p~
## <chr> <dbl>
## 1 "drivers license" 39.0
## 2 "good 4 u" 15.2
## 3 "deja vu" 14.2
## 4 "traitor" 5.08
## 5 "happier" 4.14
## 6 "brutal" 3.95
## 7 "favorite crime" 3.95
## 8 "enough for you" 3.42
## 9 "jealousy, jealousy" 3.09
## 10 "1 step forward, 3 steps back" 2.98
## 11 "All I Want - From \"High School Musical: The Musical: The ~ 2.55
## 12 "hope ur ok" 2.43
df %>%
filter(artist == "Olivia Rodrigo") %>%
select(title, streams) %>%
group_by(title) %>%
ggplot(aes(x = title, y = streams, fill = title)) + geom_col(show.legend = FALSE) + theme_minimal()
Well, this does not look very clean… Let’s try out the plotly
Library
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot <- df %>%
filter(artist == "Olivia Rodrigo") %>%
group_by(title) %>%
summarise(total_streams_percent = sum(streams)) %>%
mutate(total_streams_percent = total_streams_percent / sum(total_streams_percent) * 100) %>%
ggplot(aes(x = title, y = total_streams_percent, fill = title)) +
geom_col() +
theme_minimal() +
theme(axis.text.x = element_blank(), legend.position = "none")+
labs(title = "%-Distribution of Olivia Rodrigo streams", x = "Song", y = "%-Streams")
ggplotly(plot)
The Song “Drivers License” is the most popular
df %>%
filter(rank == 1) %>%
select(artist) %>%
n_distinct
## [1] 47
df %>%
filter(rank == 1) %>%
select(artist) %>%
group_by(artist) %>%
count() %>%
arrange(desc(n)) %>%
head()
## # A tibble: 6 x 2
## # Groups: artist [6]
## artist n
## <chr> <int>
## 1 Post Malone 174
## 2 Drake 163
## 3 Ed Sheeran 139
## 4 Ariana Grande 133
## 5 Tones And I 120
## 6 21 Savage 114
df %>%
filter(rank == 1) %>%
group_by(title) %>%
summarise(days_on_1 = n_distinct(date)) %>%
arrange(desc(days_on_1))
## # A tibble: 66 x 2
## title days_on_1
## <chr> <int>
## 1 Dance Monkey 120
## 2 rockstar 114
## 3 Señorita 102
## 4 Shape of You 97
## 5 Despacito - Remix 96
## 6 Blinding Lights 82
## 7 God's Plan 74
## 8 7 rings 68
## 9 drivers license 67
## 10 DÁKITI 60
## # ... with 56 more rows
df %>%
filter(rank == 1) %>%
group_by(artist) %>%
summarise(total_streams_percent = sum(streams)) %>%
mutate(total_streams_percent = total_streams_percent / sum(total_streams_percent)*100) %>%
arrange(desc(total_streams_percent))
## # A tibble: 47 x 2
## artist total_streams_percent
## <chr> <dbl>
## 1 Drake 8.12
## 2 Ed Sheeran 7.06
## 3 Post Malone 6.66
## 4 Ariana Grande 6.32
## 5 Olivia Rodrigo 5.83
## 6 Camila Cabello 5.76
## 7 Tones And I 5.71
## 8 Shawn Mendes 5.47
## 9 Justin Bieber 5.36
## 10 Luis Fonsi 4.63
## # ... with 37 more rows
df %>%
filter(rank == 1) %>%
group_by(artist) %>%
summarise(no1_tracks = n_distinct(title)) %>%
arrange(desc(no1_tracks)) %>%
head(10)
## # A tibble: 10 x 2
## artist no1_tracks
## <chr> <int>
## 1 Drake 7
## 2 Post Malone 6
## 3 Ariana Grande 4
## 4 Billie Eilish 4
## 5 Taylor Swift 4
## 6 The Weeknd 3
## 7 Bad Bunny 2
## 8 Camila Cabello 2
## 9 Daddy Yankee 2
## 10 Ed Sheeran 2
df_drake <- df %>%
filter(artist == "Drake")
df_drake %>%
select(artist, title) %>%
unique() %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 101
Wow, Drake had 101 Songs in the charts. As we already know, 7 of them got to No. 1. But which songs are those?
df_drake %>%
filter(rank == 1) %>%
select(title) %>%
unique()
## # A tibble: 7 x 1
## title
## <chr>
## 1 What’s Next
## 2 Toosie Slide
## 3 In My Feelings
## 4 Don’t Matter To Me
## 5 Nonstop
## 6 Nice For What
## 7 God's Plan
Which of those songs was the first to reach No.1?
df_drake %>%
filter(rank == 1) %>%
group_by(title) %>%
summarise( date = min(date)) %>%
arrange(date)
## # A tibble: 7 x 2
## title date
## <chr> <date>
## 1 God's Plan 2018-01-21
## 2 Nice For What 2018-04-09
## 3 Nonstop 2018-06-29
## 4 Don’t Matter To Me 2018-07-02
## 5 In My Feelings 2018-07-08
## 6 Toosie Slide 2020-04-10
## 7 What’s Next 2021-03-05
God’s Plan was Drakes first No.1 Hit. It reached No.1 on 21.01.2018.
plot2 <- df %>%
filter(artist == "Drake",
title %in% (
df %>% filter(artist == "Drake", rank == 1) %>%
pull(title)
)
) %>%
ggplot(aes(x = date, y = rank, color= title)) +
geom_line(show.legend = FALSE) +
theme_minimal() +
scale_y_reverse()
ggplotly(plot2)
His first No.1 Hit “God’s Plan” was also the one that stayed in the charts the longest
drake_days <- df_drake %>%
group_by(title) %>%
summarise(days = n_distinct(date)) %>%
arrange(desc(days))
head(drake_days, 10)
## # A tibble: 10 x 2
## title days
## <chr> <int>
## 1 God's Plan 717
## 2 One Dance 674
## 3 Passionfruit 396
## 4 Money In The Grave (Drake ft. Rick Ross) 378
## 5 In My Feelings 364
## 6 Toosie Slide 325
## 7 Nonstop 306
## 8 Laugh Now Cry Later (feat. Lil Durk) 302
## 9 Nice For What 298
## 10 Fake Love 257
summary(drake_days)
## title days
## Length:101 Min. : 1.0
## Class :character 1st Qu.: 7.0
## Mode :character Median : 22.0
## Mean : 72.2
## 3rd Qu.: 81.0
## Max. :717.0
df %>%
group_by(title) %>%
summarise(max_rank = min(rank), mid = median(rank), min_rank = max(rank)) %>%
arrange(mid) %>%
inner_join(drake_days)
## Joining, by = "title"
## # A tibble: 101 x 5
## title max_rank mid min_rank days
## <chr> <int> <dbl> <int> <int>
## 1 Don’t Matter To Me 1 2 10 21
## 2 I'm Upset 6 46 200 118
## 3 Toosie Slide 1 56 200 325
## 4 Emotionless 3 59 185 34
## 5 8 Out Of 10 6 59.5 197 22
## 6 Laugh Now Cry Later (feat. Lil Durk) 2 61 200 302
## 7 Nonstop 1 62 199 306
## 8 Survival 2 62 184 22
## 9 In My Feelings 1 63 200 364
## 10 Pain 1993 (with Playboi Carti) 4 63 119 21
## # ... with 91 more rows